###############################################
#                                             #
#     State Survey Data Cleaning              #
#     Created by Mac Lockhart Nov 30 2022     #
#     Updated Nov 30 2022                     #
#                                             #
###############################################

#This code recodes variables to standard outcomes and adds survey weights to the data

# Switch to the project folder only when running under the author's account.
# The rest of the script uses paths relative to the working directory, so any
# other user must already be in the folder that contains "Survey Data/".
if (Sys.info()[["user"]] == "maclockhart") {
  setwd("/Users/maclockhart/Dropbox/School/RA Work/'22 Seth & Thad/MIT Evolving Election Administration Landscape")
}
library(tidyverse)
library(RColorBrewer)
library(car)
library(data.table)

# Numeric codings for the outcome questions (per the pre-analysis plan). Each
# scale is shared by the pre- and post-treatment version of the same item, so
# it is defined once here and spliced into dplyr::recode() with `!!!` below.
trust_scale <- c(
  "Trust a lot" = 4,
  "Trust some" = 3,
  "Distrust some" = 2,
  "Distrust a lot" = 1
)
vote_fraud_scale <- c(
  "Vote fraud almost never occurs" = 5,
  "Vote fraud occurs infrequently" = 4,
  "Vote fraud occurs about half of the time" = 3,
  "Vote fraud is very common" = 2,
  "Vote fraud happens all of the time" = 1
)
official_fraud_scale <- c(
  "Fraud by official state or county election authorities almost never occurs" = 5,
  "Fraud by official state or county election authorities occurs infrequently" = 4,
  "Fraud by official state or county election authorities occurs about half of the time" = 3,
  "Fraud by official state or county election authorities is very common" = 2,
  "Fraud by official state or county election authorities happens all of the time" = 1
)
vote_likelihood_scale <- c(
  "Definitely will vote" = 5,
  "Probably will vote" = 4,
  "May or may not vote" = 3,
  "Probably will not vote" = 2,
  "Definitely will not vote" = 1
)

# Process each survey file in turn: read "Survey Data/<state>.csv", recode the
# demographic, treatment and outcome variables, expand the multi-select
# questions into logical indicator columns, flag suspected bots, optionally
# add survey weights, and write "Survey Data/<state> Recoded.csv".
for (state in c("Colorado", "Georgia", "LA", "Texas")) {
  DF <- read_csv(paste0("Survey Data/", state, ".csv"))
  DF <- DF[c(-1, -2), ] # Drop header (the first two rows after the column names)
  DF <- DF[!is.na(DF$`2024 vote post`), ] # Subset to complete respondents to ensure balance

  # Demographics and national treatment assignment.
  # recode() is always called as dplyr::recode() because library(car) is also
  # attached at the top of the file and provides its own recode().
  DF <- DF %>%
    mutate(
      party = dplyr::recode(political_party,
                            "1" = "Democrat",
                            "2" = "Republican",
                            "3" = "Independent",
                            "4" = "Independent"),
      race_full = dplyr::recode(ethnicity,
                                "1" = "White",
                                "2" = "Black, or African American",
                                "3" = "American Indian or Alaska Native",
                                "4" = "Asian *** Asian Indian",
                                "5" = "Asian *** Chinese",
                                "6" = "Asian *** Filipino",
                                "7" = "Asian *** Japanese",
                                "8" = "Asian *** Korean",
                                "9" = "Asian *** Vietnamese",
                                "10" = "Asian *** Other",
                                "11" = "Pacific Islander *** Native Hawaiian",
                                "12" = "Pacific Islander *** Guamanian",
                                "13" = "Pacific Islander *** Samoan",
                                "14" = "Pacific Islander *** Other Pacific Islander",
                                "15" = "Some other race",
                                "16" = "Prefer not to answer"),
      # Same source column as race_full, collapsed to broad categories.
      race_short = dplyr::recode(ethnicity,
                                 "1" = "White",
                                 "2" = "Black, or African American",
                                 "3" = "American Indian or Alaska Native",
                                 "4" = "Asian",
                                 "5" = "Asian",
                                 "6" = "Asian",
                                 "7" = "Asian",
                                 "8" = "Asian",
                                 "9" = "Asian",
                                 "10" = "Asian",
                                 "11" = "Pacific Islander",
                                 "12" = "Pacific Islander",
                                 "13" = "Pacific Islander",
                                 "14" = "Pacific Islander",
                                 "15" = "Some other race",
                                 "16" = "Prefer not to answer"),
      ethnicity_full = dplyr::recode(hispanic,
                                     "1" = "Not Hispanic",
                                     "2" = "Hispanic - Mexican, Mexican American, Chicano",
                                     "3" = "Hispanic - Cuban",
                                     "4" = "Hispanic - Argentina",
                                     "5" = "Hispanic - Colombia",
                                     "6" = "Hispanic - Ecuador",
                                     "7" = "Hispanic - El Salvadore",
                                     "8" = "Hispanic - Guatemala",
                                     "9" = "Hispanic - Nicaragua",
                                     "10" = "Hispanic - Panama",
                                     "11" = "Hispanic - Peru",
                                     "12" = "Hispanic - Spain",
                                     "13" = "Hispanic - Venezuela",
                                     "14" = "Hispanic - Other Country",
                                     "15" = "Prefer not to answer",
                                     "16" = "Hispanic - Puerto Rican"),
      # Same source column as ethnicity_full, collapsed to Hispanic / not.
      ethnicity_short = dplyr::recode(hispanic,
                                      "1" = "Not Hispanic",
                                      "2" = "Hispanic",
                                      "3" = "Hispanic",
                                      "4" = "Hispanic",
                                      "5" = "Hispanic",
                                      "6" = "Hispanic",
                                      "7" = "Hispanic",
                                      "8" = "Hispanic",
                                      "9" = "Hispanic",
                                      "10" = "Hispanic",
                                      "11" = "Hispanic",
                                      "12" = "Hispanic",
                                      "13" = "Hispanic",
                                      "14" = "Hispanic",
                                      "15" = "Prefer not to answer",
                                      "16" = "Hispanic"),
      household_income = dplyr::recode(hhi,
                                       "1" = "Less than $14,999",
                                       "2" = "$15,000 to $19,999",
                                       "3" = "$20,000 to $24,999",
                                       "4" = "$25,000 to $29,999",
                                       "5" = "$30,000 to $34,999",
                                       "6" = "$35,000 to $39,999",
                                       "7" = "$40,000 to $44,999",
                                       "8" = "$45,000 to $49,999",
                                       "9" = "$50,000 to $54,999",
                                       "10" = "$55,000 to $59,999",
                                       "11" = "$60,000 to $64,999",
                                       "12" = "$65,000 to $69,999",
                                       "13" = "$70,000 to $74,999",
                                       "14" = "$75,000 to $79,999",
                                       "15" = "$80,000 to $84,999",
                                       "16" = "$85,000 to $89,999",
                                       "17" = "$90,000 to $94,999",
                                       "18" = "$95,000 to $99,999",
                                       # Key was a duplicated "18", which left
                                       # code 19 unlabelled in the output.
                                       "19" = "$100,000 to $124,999",
                                       "20" = "$125,000 to $149,999",
                                       "21" = "$150,000 to $174,999",
                                       "22" = "$175,000 to $199,999",
                                       "23" = "$200,000 to $249,999",
                                       "24" = "$250,000 and above",
                                       "-3105" = "Prefer not to answer"),
      education_full = dplyr::recode(education,
                                     "1" = "Some high school or less",
                                     "2" = "High school graduate",
                                     "3" = "Other post high school vocational training",
                                     "4" = "Completed some college, but no degree",
                                     "5" = "Associate's degree",
                                     "6" = "Bachelor's degree",
                                     "7" = "Master's or professional degree",
                                     "8" = "Doctorate degree",
                                     "-3105" = "None of the above"),
      # National treatment: 0/1 dummy (either treatment arm = 1) and a label.
      d_treatment = dplyr::recode(FL_18_DO,
                                  "NationalTreatment-Control" = 0,
                                  "NationalTreatment-Emotion" = 1,
                                  "NationalTreatment-Facts" = 1),
      treatment = dplyr::recode(FL_18_DO,
                                "NationalTreatment-Control" = "Control",
                                "NationalTreatment-Emotion" = "Emotions",
                                "NationalTreatment-Facts" = "Facts"),
      gender_text = dplyr::recode(gender,
                                  "1" = "Male",
                                  "2" = "Female")
    )

  DF$age <- as.numeric(DF$age)

  # State specific coding: each file has its own pair of state-level treatment
  # arms in FL_17_DO. d_state_treatment is a 0/1 dummy (either arm = 1) and
  # state_treatment is the readable label.
  if (state == "Colorado") {
    DF <- DF %>% mutate(
      d_state_treatment = dplyr::recode(FL_17_DO,
                                        "StateControl" = 0,
                                        "ColoradoAbsenteeVotingTreatment" = 1,
                                        "ColoradoFact-BasedTreatment" = 1),
      state_treatment = dplyr::recode(FL_17_DO,
                                      "StateControl" = "Control",
                                      "ColoradoAbsenteeVotingTreatment" = "Colorado - Absentee Voting",
                                      "ColoradoFact-BasedTreatment" = "Colorado - Facts"))
  }
  if (state == "Georgia") {
    DF <- DF %>% mutate(
      d_state_treatment = dplyr::recode(FL_17_DO,
                                        "StateControl" = 0,
                                        "GeorgiaAbsenteeVotingTreatment" = 1,
                                        "GeorgiaVotingSystemTreatment" = 1),
      state_treatment = dplyr::recode(FL_17_DO,
                                      "StateControl" = "Control",
                                      "GeorgiaAbsenteeVotingTreatment" = "Georgia - Absentee Voting",
                                      "GeorgiaVotingSystemTreatment" = "Georgia - Voting System"))
  }
  if (state == "LA") {
    DF <- DF %>% mutate(
      d_state_treatment = dplyr::recode(FL_17_DO,
                                        "StateControl" = 0,
                                        "LAAbsenteeVotingTreatment" = 1,
                                        "LAGOTVTreatment" = 1),
      state_treatment = dplyr::recode(FL_17_DO,
                                      "StateControl" = "Control",
                                      "LAAbsenteeVotingTreatment" = "Los Angeles - Absentee Voting",
                                      "LAGOTVTreatment" = "Los Angeles - GOTV"))
  }
  if (state == "Texas") {
    DF <- DF %>% mutate(
      d_state_treatment = dplyr::recode(FL_17_DO,
                                        "StateControl" = 0,
                                        "TexasAbsenteeVotingTreatment" = 1,
                                        "TexasFoxNewsTreatment" = 1),
      state_treatment = dplyr::recode(FL_17_DO,
                                      "StateControl" = "Control",
                                      "TexasAbsenteeVotingTreatment" = "Texas - Absentee Voting",
                                      "TexasFoxNewsTreatment" = "Texas - Fox"))
  }

  # Separate multiple entry choices: each multi-select answer is stored as one
  # text field, so every option becomes a logical column that is TRUE when the
  # option's text appears in that field (grepl() gives FALSE for missing text).

  # Information sources
  DF$info_youtube <- grepl("YouTube", DF$`Info Sources List`)
  DF$info_tiktok <- grepl("TikTok", DF$`Info Sources List`)
  DF$info_instagram <- grepl("Instagram", DF$`Info Sources List`)
  DF$info_other_sm <- grepl("Another social media network", DF$`Info Sources List`)
  DF$info_twitter <- grepl("Twitter", DF$`Info Sources List`)
  DF$info_television <- grepl("Television", DF$`Info Sources List`)
  DF$info_internet <- grepl("Searching the internet", DF$`Info Sources List`)
  DF$info_facebook <- grepl("Facebook", DF$`Info Sources List`)
  DF$info_radio <- grepl("Radio", DF$`Info Sources List`)
  DF$info_news <- grepl("Newspapers", DF$`Info Sources List`)

  DF$sources_election_officials <- grepl("Local and state elections officials", DF$`Information Sources`)
  DF$sources_television <- grepl("Television news in my local area", DF$`Information Sources`)
  DF$sources_fox <- grepl("Fox News", DF$`Information Sources`)
  DF$sources_cnn <- grepl("CNN", DF$`Information Sources`)
  DF$sources_politicians <- grepl("Political leaders in my party", DF$`Information Sources`)

  # Voting issues
  DF$issues_hours <- grepl("Hours available for in-person voting", DF$Issues)
  DF$issues_in_person_locations <- grepl("Accessibility of in-person voting location", DF$Issues)
  DF$issues_assistance <- grepl("Assistance of election workers", DF$Issues)
  DF$issues_language <- grepl("Lack of voting materials in preferred languages", DF$Issues)
  DF$issues_electronic_machines <- grepl("Ease or difficulty of using electronic voting machines", DF$Issues)
  DF$issues_finding_location <- grepl("Locating in-person voting location address", DF$Issues)
  DF$issues_vote_by_mail <- grepl("Ease or difficulty of voting by mail", DF$Issues)
  DF$issues_voting_information <- grepl("Lack of voting information or materials", DF$Issues)
  DF$issues_not_confident_counted <- grepl("Not confident ballot will be counted correctly", DF$Issues)
  DF$issues_long_line <- grepl("A long line at the location where I voted", DF$Issues)

  # Disabilities
  DF$disability_hearing <- grepl("Hearing", DF$Disabilities)
  DF$disability_seeing <- grepl("Seeing", DF$Disabilities)
  DF$disability_walking <- grepl("Walking", DF$Disabilities)
  DF$disability_hands <- grepl("Using your hands", DF$Disabilities)
  DF$disability_reading <- grepl("Reading", DF$Disabilities)
  DF$disability_talking <- grepl("Talking", DF$Disabilities)
  DF$disability_thinking <- grepl("Thinking", DF$Disabilities)
  DF$disability_remembering <- grepl("Remembering", DF$Disabilities)
  DF$disability_none <- grepl("None", DF$Disabilities)

  ### Recode outcome variables per the pre-analysis plan
  # Answers that match no entry of a scale become NA (dplyr warns about them).
  DF <- DF %>%
    mutate(
      ownstatepost = dplyr::recode(`State Post Outcome`, !!!trust_scale),
      ownstatepre = dplyr::recode(`Trust Own State`, !!!trust_scale),
      otherstatepost = dplyr::recode(`Other State Post`, !!!trust_scale),
      otherstatepre = dplyr::recode(`Trust Other States`, !!!trust_scale),
      votefraudpost = dplyr::recode(`Vote fraud post`, !!!vote_fraud_scale),
      votefraudpre = dplyr::recode(`Vote fraud`, !!!vote_fraud_scale),
      officialfraudpost = dplyr::recode(`Official fraud post`, !!!official_fraud_scale),
      officialfraudpre = dplyr::recode(`Official Fraud`, !!!official_fraud_scale),
      vote2024post = dplyr::recode(`2024 vote post`, !!!vote_likelihood_scale),
      vote2024pre = dplyr::recode(`Vote likelihood`, !!!vote_likelihood_scale)
    )

  # Identify bots in every state. DF$bot is 1 for a suspected bot, 0 otherwise;
  # respondents with no open-ended answer end up as 0.
  # NOTE(review): in the Map(grepl, ...) calls the respondent's answer is the
  # regex *pattern* and the question text is the string searched, so `v` is
  # TRUE when the answer occurs inside the question (i.e. the question was
  # echoed back). An answer containing regex metacharacters is interpreted as
  # a regex and can match unexpectedly or error; consider fixed = TRUE after
  # confirming it does not change which respondents are flagged.
  if (state %in% c("Colorado", "Texas")) {
    v <- unlist(Map(grepl, DF$`Distrust Open Ended`, "Would you like to share with us another aspect of elections that you do not trust?"))
    y <- nchar(DF$`Distrust Open Ended`) > 5 # ignore very short answers
    DF$bot <- ifelse(v & y, 1, 0)
    DF$bot <- ifelse(is.na(DF$bot), 0, DF$bot)
  }
  if (state == "Georgia") {
    # Georgia uses an exact-match rule on one specific answer instead.
    DF$bot <- ifelse(DF$`Distrust Open Ended` == "OK EM", 1, 0)
    DF$bot <- ifelse(is.na(DF$bot), 0, DF$bot)
  }
  if (state == "LA") {
    v <- unlist(Map(grepl, DF$`Distrust Open Ended`, "Would you like to share with us another aspect of elections that you do not trust?"))
    y <- nchar(DF$`Distrust Open Ended`) > 5 # ignore very short answers
    DF$bot <- ifelse(v & y, 1, 0)
    # LA additionally flags the literal answer "1".
    DF$bot <- ifelse(DF$`Distrust Open Ended` == "1", 1, DF$bot)
    DF$bot <- ifelse(is.na(DF$bot), 0, DF$bot)
  }

  # The weighting script lives at a path on the author's machine, so it is
  # only sourced there. It runs in the global environment; presumably it reads
  # and updates DF in place -- verify against add_weights.R.
  if (Sys.info()[["user"]] == "maclockhart") {
    source("/Users/maclockhart/Dropbox/School/RA Work/'22 Seth & Thad/Mac and Jen/Code/add_weights.R")
  } else {
    print("I am not adding weights to your samples")
  }
  write.csv(DF, paste0("Survey Data/", state, " Recoded.csv"))
}
